{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# use natural language toolkit\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem.lancaster import LancasterStemmer\n",
    "# word stemmer\n",
    "stemmer = LancasterStemmer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12 sentences in training data\n"
     ]
    }
   ],
   "source": [
    "# 3 classes of training data\n",
    "training_data = []\n",
    "training_data.append({\"class\":\"greeting\", \"sentence\":\"how are you?\"})\n",
    "training_data.append({\"class\":\"greeting\", \"sentence\":\"how is your day?\"})\n",
    "training_data.append({\"class\":\"greeting\", \"sentence\":\"good day\"})\n",
    "training_data.append({\"class\":\"greeting\", \"sentence\":\"how is it going today?\"})\n",
    "\n",
    "training_data.append({\"class\":\"goodbye\", \"sentence\":\"have a nice day\"})\n",
    "training_data.append({\"class\":\"goodbye\", \"sentence\":\"see you later\"})\n",
    "training_data.append({\"class\":\"goodbye\", \"sentence\":\"have a nice day\"})\n",
    "training_data.append({\"class\":\"goodbye\", \"sentence\":\"talk to you soon\"})\n",
    "\n",
    "training_data.append({\"class\":\"sandwich\", \"sentence\":\"make me a sandwich\"})\n",
    "training_data.append({\"class\":\"sandwich\", \"sentence\":\"can you make a sandwich?\"})\n",
    "training_data.append({\"class\":\"sandwich\", \"sentence\":\"having a sandwich today?\"})\n",
    "training_data.append({\"class\":\"sandwich\", \"sentence\":\"what's for lunch?\"})\n",
    "print (\"%s sentences in training data\" % len(training_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Corpus words and counts: {'ar': 1, 'yo': 1, 'talk': 1, 'day': 4, 'sandwich': 3, 'mak': 2, 'to': 1, 'good': 1, 'lat': 1, 'going': 1, 'me': 1, 'today': 2, 'soon': 1, 'hav': 3, 'nic': 2, 'a': 5, 'how': 3, 'lunch': 1, 'is': 2, 'what': 1, 'you': 4, 'it': 1, 'see': 1, 'can': 1, 'for': 1}\n",
      "Class words: {'sandwich': ['mak', 'me', 'a', 'sandwich', 'can', 'you', 'mak', 'a', 'sandwich', 'hav', 'a', 'sandwich', 'today', 'what', 'for', 'lunch'], 'goodbye': ['hav', 'a', 'nic', 'day', 'see', 'you', 'lat', 'hav', 'a', 'nic', 'day', 'talk', 'to', 'you', 'soon'], 'greeting': ['how', 'ar', 'you', 'how', 'is', 'yo', 'day', 'good', 'day', 'how', 'is', 'it', 'going', 'today']}\n"
     ]
    }
   ],
   "source": [
    "# capture unique stemmed words in the training corpus\n",
    "corpus_words = {}\n",
    "classes = list(set([a['class'] for a in training_data]))\n",
    "for c in classes:\n",
    "    class_words[c] = []\n",
    "    \n",
    "for data in training_data:\n",
    "    # tokenize each sentence into words\n",
    "    for word in nltk.word_tokenize(data['sentence']):\n",
    "        # ignore a few things\n",
    "        if word not in [\"?\", \"'s\"]:\n",
    "            # stem and lowercase each word\n",
    "            stemmed_word = stemmer.stem(word.lower())\n",
    "            if stemmed_word not in corpus_words:\n",
    "                corpus_words[stemmed_word] = 1\n",
    "            else:\n",
    "                corpus_words[stemmed_word] += 1\n",
    "                \n",
    "            class_words[data['class']].extend([stemmed_word])\n",
    "\n",
    "# we now have each word and the number of occurances of the word in our training corpus (the word's commonality)\n",
    "print (\"Corpus words and counts: %s\" % corpus_words)\n",
    "# also we have all words in each class\n",
    "print (\"Class words: %s\" % class_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# we can now calculate the Naive Bayes score for a new sentence\n",
    "sentence = \"good day for us to have lunch?\"\n",
    "\n",
    "# calculate a score for a given class\n",
    "def calculate_class_score(sentence, class_name):\n",
    "    score = 0\n",
    "    for word in nltk.word_tokenize(sentence):\n",
    "        if word in class_words[class_name]:\n",
    "            score += 1\n",
    "    return score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Class: sandwich  Score: 2\n",
      "Class: goodbye  Score: 2\n",
      "Class: greeting  Score: 2\n"
     ]
    }
   ],
   "source": [
    "# now we can find the class with the highest score\n",
    "for c in class_words.keys():\n",
    "    print (\"Class: %s  Score: %s\" % (c, calculate_class_score(sentence, c)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# calculate a score for a given class taking into account word commonality\n",
    "def calculate_class_score_commonality(sentence, class_name):\n",
    "    score = 0\n",
    "    for word in nltk.word_tokenize(sentence):\n",
    "        if word in class_words[class_name]:\n",
    "            score += (1 / corpus_words[word])\n",
    "    return score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Class: sandwich  Score: 2.0\n",
      "Class: goodbye  Score: 1.25\n",
      "Class: greeting  Score: 1.25\n"
     ]
    }
   ],
   "source": [
    "# now we can find the class with the highest score\n",
    "for c in class_words.keys():\n",
    "    print (\"Class: %s  Score: %s\" % (c, calculate_class_score_commonality(sentence, c)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# return the class with highest score for sentence\n",
    "def find_class(sentence):\n",
    "    high_class = None\n",
    "    high_score = 0\n",
    "    for c in class_words.keys():\n",
    "        score = calculate_class_score_commonality(sentence, c)\n",
    "        if score > high_score:\n",
    "            high_class = c\n",
    "            high_score = score\n",
    "    return high_class, high_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('sandwich', 2.0)"
      ]
     },
     "execution_count": 160,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "find_class(\"make me lunch?\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
